
/***********************************************
"Male Earnings, Marriageable Men, and Non-marital Fertility: Evidence from the Fracking Boom"

Melissa Kearney and Riley Wilson

This .do file contains the code needed to prepare the Vital Statistics Birth Data
for analysis and merge it with the existing data.

Data needed:
1990-2013 County Geocoded Vital Statistics (because of data limitations for other data sources,
our analysis only ranges from 1997-2012).

************************************************/
* Session setup: turn off output paging and start from an empty workspace
set more off
clear all
* Path globals -- fill these in before running the file
* vsdata: folder where the Vital Statistics data is currently located
global vsdata ""
* data: folder where data will be saved
global data ""


/********************************
PREPARE COUNTY MEASURES OF VITAL STATISTICS BIRTH DATA
********************************/
*Create a master file of all counties and all years: one row per county and
*conception year (1990-2013) with every birth count set to zero.
cd $data
use cz_counties, clear
sort ctyfips
keep ctyfips
expand 24 //24 years
bys ctyfips: gen n = _n
gen yrconcept = 1989+n //n runs 1-24, so yrconcept runs 1990-2013
drop n

*Zero-valued birth counts by marital status (tot/mar/nmar), by group, and by age.
*NOTE(review): the age-specific list below uses firstbnomove and the age groups
*18_19, 15_19 and 20_34, while the yearly loops further down generate fbnomove
*and the age groups 15_17, 18_34 and 35_44. The names are left untouched here so
*the saved file keeps its current layout -- confirm which naming is intended.
foreach var in tot mar nmar {
	foreach g in b fb lscollb lscollfb collb collfb bnomove fbnomove {
		gen `var'`g' = 0
	}
	foreach a in 15_17 18_19 15_19 20_34 35_44 {
		foreach g in b fb lscollb lscollfb collb collfb bnomove firstbnomove {
			gen `var'`g'`a' = 0
		}
	}
}
cd $vsdata
save ctyall_yrs90_13, replace

*The yearly loops below append a file called cty_allyrq90_13 so that counties
*with no births stay in the data, but nothing in this file creates it. If it is
*not already in the Vital Statistics folder, build it here as one placeholder
*row per county, conception year and conception quarter. Only the keys are
*kept: the appended rows then have missing birth counts, which collapse (sum)
*adds up to zero. An existing cty_allyrq90_13 is never overwritten.
capture confirm file cty_allyrq90_13.dta
if _rc {
	keep ctyfips yrconcept
	expand 4 //4 quarters
	bys ctyfips yrconcept: gen qconcept = _n
	assert qconcept <= 4 //fails if cz_counties lists a county more than once
	save cty_allyrq90_13
}

//RESTRICTED VITAL STATS DATA
//For each birth year 2003-2013: read the fixed-width natality file, date each
//birth to its conception year and quarter, flag births by marital status,
//parity, education, age group, race and non-mover status, and save counts by
//county of residence at the quarterly and annual level.
	
local datasets = `" "Nat03uspt2.dat" "NAT04USpt2.dat" "NAT05USpt2.dat" "NAT06USpt2.dat" "VS07NATL.USPART2" "VS08NATL.DETAILUS.PART2" "nat2009us.part2AllCounties" "nat2010us.part2AllCounties" "nat2011us.part2AllCounties" "nat2012us.part2AllCounties" "nat2013us.part2AllCounties" "'
forval yr = 2003/2013 {
	//Position of this year's file name in the list above (2003 is word 1)
	local i = `yr'-2002
	local dataset : word `i' of `datasets'
	cd $vsdata
	cd vs`yr'
	
	infix yrbirth 15-18 mobirth 19-20 str stbirth 30-36	ctybirth 37-39 momagegroup	89-90 str mombirthst 98-99	///
	str momst 109-110 momcty 114-116 sameres 137 momrace 143 momhispanic 148 mommarst 153 mommarst_impute 154 momeduc1 155 momeduc2 156-157 momeduc3 158 ///
	dad_rptageused 175	dadage 182-183 dadage2 184-185 dadage3 186-187 dadhisp 196	dadeduc 197	dadrace 199-200	///
	livebirthorder 212 totalbirthorder 217 mollbirth 220-221 yrllbirth 222-225 gestation 451-452 gest2 446-447	///	
	using `dataset', clear

	//Merge key: the last two characters of the mother's state of residence
	gen stateabbreviation = substr(momst,-2,.)
	*Identify the state FIPS
	cd $data
	merge m:1 stateabbreviation using fips_states
	drop if _m == 1 //AB, BC, MB, MP, ON, QC, SK, XX, ZZ
	drop if _m == 2 //Outside Territories
	drop _m

	replace gestation = gest2 if gestation == 99 & gest2 ~= 99 //replace with OBGYN estimated gestation if missing
	replace gestation = 39 if gestation == 99 
	//We will assume that births where gestation is unknown, were at the median (39 weeks)
	*Identify Year of conception
	//The birth is dated to the 15th of the birth month; gestation is in weeks
	gen conceptdate = mdy(mobirth,15,yrbirth)-(gestation*7) //date of birth minus days of gestation
	gen moconcept = month(conceptdate)
	gen yrconcept = year(conceptdate)
	gen qconcept = quarter(conceptdate)
	//Build a single-year age of mother; the coding of momagegroup differs in 2003
	if `yr' == 2003 {
		gen momage = momagegroup+13 //currently 1 is under 15
	}
	if `yr' >=2004 {
		gen momage = momagegroup // currently the group is the age except for 10-12
		replace momage = 14 if momagegroup<14 //now I will have the same groups: 14 and under, then each age year
	}
	assert momage>0
	/*Different states recorded mother's education differently. I need to combine them 
	all into one measure 
	CURRENTLY, I AM USING AGE RATHER THAN EDUCATION, BUT THIS WILL BE USED LATER*/
	//momeduc: 1 = less than HS, 2 = HS, 3 = some college, 4 = college or more.
	//Each rule applies only when the other education field is missing.
	gen momeduc = .
	*<HS (momeduc1<3 or momeduc2<12)
	replace momeduc = 1 if inlist(momeduc1,1,2) & momeduc2 == . 
	replace momeduc = 1 if momeduc2<12 & momeduc1 == . 
	*HS (momeduc1 = 3 or momeduc2 = 12)
	replace momeduc = 2 if momeduc1 == 3 & momeduc2 == .
	replace momeduc = 2 if momeduc2 == 12 & momeduc1 == . 
	*Some College (momeduc1 =4,5 or momeduc2 = 13,14,15)
	replace momeduc = 3 if inlist(momeduc1,4,5) & momeduc2 == . 
	replace momeduc = 3 if inlist(momeduc2,13,14,15) & momeduc1 == . 
	*College or More (momeduc1 = 6,7,8 or momeduc2 = 16,17)
	replace momeduc = 4 if inlist(momeduc1,6,7,8) & momeduc2 == . 
	replace momeduc = 4 if inlist(momeduc2,16,17) & momeduc1 == . 
	
	//A missing momeduc sorts above every number in Stata, so births with no
	//education code get lscoll = 0 and coll = 0
	gen lscoll = momeduc <=3
	gen coll = momeduc == 4
	//PULL OUT THE MOTHERS THAT WE CARE ABOUT 
	//15-17
	gen momage15_17 = momage>=15 & momage<=17
	//18-34
	gen momage18_34 = momage>=18 & momage<=34
	//35-44
	gen momage35_44 = momage>=35 & momage<=44
	*Gen Measures for Birth in Each Group
	//Each measure is 1 for births in the group and missing otherwise, so the
	//collapse (sum) below turns it into a count
	gen totb = 1
	*Get measure of marital birth
	gen marb = 1 if mommarst == 1
	*Get measure of non-marital birth
	gen nmarb= 1 if mommarst >=2 & mommarst <=3
	
	//Indicate the race of the mother
	gen momwhitenh = momrace == 1 & momhispanic == 0
	gen momblacknh = momrace == 2 & momhispanic == 0
	gen momhisp = momhispanic > 0 & momhispanic <9 //excludes origin unknown or unstated
	gen momothernh = (~inlist(momrace,1,2) & momhispanic == 0) | momhispanic == 9 //origin unknown 
	//Indicate if the mother is living in the same state as state of birth
	gen momsamestate = mombirthst == momst
	//Indicate if first birth
	gen fb = totalbirthorder == 1
	
	//Cross the total / marital / non-marital flags with parity, education,
	//race, age group and non-mover status
	foreach var in tot mar nmar {
		gen `var'fb = 1 if `var'b == 1 & fb == 1
		gen `var'lscollb = 1 if `var'b == 1 & lscoll == 1
		gen `var'lscollfb = 1 if `var'b == 1 & lscoll == 1 & fb == 1
		gen `var'collb = 1 if `var'b == 1 & coll == 1
		gen `var'collfb = 1 if `var'b == 1 & coll == 1 & fb == 1
		foreach r in whitenh blacknh hisp othernh {
			gen `var'b`r' = 1 if `var'b == 1 & mom`r' == 1 
		}
		foreach a in 15_17 18_34 35_44 {
			gen `var'b`a' = 1 if `var'b == 1 & momage`a' == 1
			gen `var'fb`a' = 1 if `var'b == 1 & momage`a' == 1 & fb == 1
			//By education
			gen `var'lscollb`a' = 1 if `var'b == 1 & momage`a' == 1 & lscoll == 1
			gen `var'lscollfb`a' = 1 if `var'b == 1 & momage`a' == 1 & lscoll == 1 & fb == 1
			
			gen `var'collb`a' = 1 if `var'b == 1 & momage`a' == 1 & coll == 1
			gen `var'collfb`a' = 1 if `var'b == 1 & momage`a' == 1 & coll == 1 & fb == 1
			
			//non-movers
			gen `var'bnomove`a' = 1 if `var'b == 1 & momage`a' == 1 & momsamestate == 1
			gen `var'fbnomove`a' = 1 if `var'b == 1 & momage`a' == 1 & fb == 1 & momsamestate == 1
			//By race
			foreach r in whitenh blacknh hisp othernh {
				gen `var'b`r'`a' = 1 if `var'b == 1 & momage`a' == 1 & mom`r' == 1 
			}
		}
	}
	//Sanity checks: the marital and education flags must not overlap
	assert marb == . if nmarb == 1
	assert totcollb18_34 == . if totlscollb18_34 == 1
	//Five-digit county code: state code times 1000 plus the county of residence
	gen ctyfips = (fipscode*1000)+momcty
	/*Some births don't have the mother's county of residence recorded (momcty is 0);
	for these we use the recorded county of occurrence instead. In 2003 
	the county was the same in 74% of all births, this usually only is a small group of people in 2003: 7 births
	NOTE(review): the state part of the code still comes from fipscode, not from the
	state of occurrence -- confirm this is intended.*/
	replace ctyfips = (fipscode*1000)+ctybirth if momcty == 0 
	replace ctyfips = 12086 if ctyfips == 12025 //Update the fips code to the current for Miami Dade
	replace ctyfips = 51083 if ctyfips == 51780 // South Boston in VA merged with Halifax county
	replace ctyfips = 51005 if ctyfips == 51560 // Clifton Forge in VA merged with Alleghany county
	replace ctyfips = 30067 if ctyfips == 30113 // Yellowstone added to Park County Montana
	//Get annual measure
	
	*Get County Level Measures
	cd $vsdata
	//Expects cty_allyrq90_13.dta to exist in the Vital Statistics folder
	append using cty_allyrq90_13 // tacks on one observation for each county so counties with no births are still present
	//Keep conceptions dated to the birth year or the year before it
	keep if yrconcept == `yr' | yrconcept == `yr'-1
	keep totb* marb* nmarb* totfb* marfb* nmarfb* totlscoll* marlscoll* nmarlscoll* totcoll* marcoll* nmarcoll* ctyfips yrconcept qconcept
	compress
	//quarterly
	collapse (sum) totb* marb* nmarb* totfb* marfb* nmarfb* ///
		totlscoll* marlscoll* nmarlscoll* totcoll* marcoll* nmarcoll* , by(ctyfips yrconcept qconcept)
	drop if yrconcept == .
	
	cd $vsdata
	save vscty_birthsq`yr', replace
	//annual (sums the quarterly counts just saved)
	collapse (sum) totb* marb* nmarb* totfb* marfb* nmarfb* ///
		totlscoll* marlscoll* nmarlscoll* totcoll* marcoll* nmarcoll*, by(ctyfips yrconcept)
	drop if yrconcept == .
	cd $vsdata
	save vsctybinsannual`yr', replace
}
*Now I need to do the years before 2003 separately
*(the record layout read by infix and several codings differ from 2003 onward)

*Convert the comma-delimited mother's-state-of-birth crosswalk to a Stata file
cd $vsdata
insheet using mom_stbirthnum.txt, delimiter(",") names clear
save momstbirth_xwalk, replace
local datasets = `" "Natl1990.pt2" "Natl1991.pt2" "Natl1992.pt2" "Natl1993.pt2" "Nat94us.txt" "Nat95us.txt" "Nat96p2.us" "Natl1997.pt2" "Natl98p2.us" "Natl99us.pt2" "Natl00us.pt2" "Nat01us.pt2" "Nat02us.dat" "'

forval yr = 1990/2002 {
	//Position of this year's file name in the list above (1990 is word 1)
	local i = `yr'-1989
	local dataset : word `i' of `datasets'
	cd $vsdata
	cd vs`yr'
	
	infix stbirth 21-22 ctybirth 23-25 momst 42-43 momcty 44-46 momagegroup 72-73 momhispanic 77 momrace 80-81 momeduc1 83-84 momeduc2 85 ///
		mommarst_flag 86 mommarst 87 mombirthst 88-89 livebirthorder 102 totalbirthorder 105 dadage 154-155 mobirth 172-173 yrbirth 176-179  ///
		gestation 183-184 ///
		using `dataset', clear
		
	replace gestation = 39 if gestation == 99 //replace with median 39 weeks, no separate clinical estimate
	assert gestation <90
	*Identify Year of conception
	//The birth is dated to the 15th of the birth month; gestation is in weeks
	gen conceptdate = mdy(mobirth,15,yrbirth)- (gestation*7) //date of birth minus days of gestation
	gen moconcept = month(conceptdate)
	gen yrconcept = year(conceptdate)
	gen qconcept = quarter(conceptdate)
	
	gen momage = momagegroup+13 //14 is 14 and under
	
	assert momage>0
	*Get identical measure of mother's education
	//momeduc: 1 = 11 or fewer, 2 = 12, 3 = 13-15, 4 = 16-17 on the momeduc1 scale
	gen momeduc = .
	replace momeduc = 1 if momeduc1<=11
	replace momeduc = 2 if momeduc1 == 12 
	replace momeduc = 3 if inlist(momeduc1,13,14,15) 
	replace momeduc = 4 if inlist(momeduc1,16,17) 
	
	//Code 99 must stay unclassified
	assert momeduc == . if momeduc1 == 99
	//A missing momeduc sorts above every number in Stata, so births with no
	//education code get lscoll = 0 and coll = 0
	gen lscoll = momeduc <=3
	gen coll = momeduc == 4
	//PULL OUT THE MOTHERS THAT WE CARE ABOUT 
	//15-17
	gen momage15_17 = momage>=15 & momage<=17
	//18-34
	gen momage18_34 = momage>=18 & momage<=34
	//35-44
	gen momage35_44 = momage>=35 & momage<=44

	*Gen Measures for Birth in Each Group
	//Each measure is 1 for births in the group and missing otherwise, so the
	//collapse (sum) below turns it into a count
	gen totb = 1
	*Get measure of marital birth
	gen marb = 1 if mommarst == 1
	*Get measure of non-marital birth
	gen nmarb = 1 if mommarst >=2 & mommarst <=3
	
	//Indicate the race of the mother
	gen momwhitenh = momrace == 1 & momhispanic == 0
	gen momblacknh = momrace == 2 & momhispanic == 0
	gen momhisp = momhispanic > 0 & momhispanic <9 //excludes origin unknown or unstated
	gen momothernh = (~inlist(momrace,1,2) & momhispanic == 0) | momhispanic == 9 //origin unknown 
	//Indicate if mother has moved from state of birth
	cd $vsdata
	merge m:1 mombirthst using momstbirth_xwalk
	//Birth-state codes with no match in the crosswalk are flagged as foreign born
	gen momforeignborn = _m == 1
	drop if _m == 2 
	drop _m
	//mombirthstfips is presumably supplied by the crosswalk -- TODO confirm
	gen momsamestate = mombirthstfips == momst
	//Indicate if first birth
	gen fb = totalbirthorder == 1
	
	//Cross the total / marital / non-marital flags with parity, education,
	//race, age group and non-mover status
	foreach var in tot mar nmar {
		gen `var'fb = 1 if `var'b == 1 & fb == 1
		gen `var'lscollb = 1 if `var'b == 1 & lscoll == 1
		gen `var'lscollfb = 1 if `var'b == 1 & lscoll == 1 & fb == 1
		gen `var'collb = 1 if `var'b == 1 & coll == 1
		gen `var'collfb = 1 if `var'b == 1 & coll == 1 & fb == 1
		foreach r in whitenh blacknh hisp othernh {
			gen `var'b`r' = 1 if `var'b == 1 & mom`r' == 1 
		}
		foreach a in 15_17 18_34 35_44 {
			gen `var'b`a' = 1 if `var'b == 1 & momage`a' == 1
			gen `var'fb`a' = 1 if `var'b == 1 & momage`a' == 1 & fb == 1
			// By education
			gen `var'lscollb`a' = 1 if `var'b == 1 & momage`a' == 1 & lscoll == 1
			gen `var'lscollfb`a' = 1 if `var'b == 1 & momage`a' == 1 & lscoll == 1 & fb == 1
			
			gen `var'collb`a' = 1 if `var'b == 1 & momage`a' == 1 & coll == 1
			gen `var'collfb`a' = 1 if `var'b == 1 & momage`a' == 1 & coll == 1 & fb == 1
			//non-movers
			gen `var'bnomove`a' = 1 if `var'b == 1 & momage`a' == 1 & momsamestate == 1
			gen `var'fbnomove`a' = 1 if `var'b == 1 & momage`a' == 1 & fb == 1 & momsamestate == 1
			//By race
			foreach r in whitenh blacknh hisp othernh {
				gen `var'b`r'`a' = 1 if `var'b == 1 & momage`a' == 1 & mom`r' == 1 
			}
		}
	}
	//Sanity check: a birth cannot be flagged both marital and non-marital
	assert marb == . if nmarb == 1
	//Five-digit county code: state of residence times 1000 plus county of residence
	gen ctyfips = (momst*1000)+momcty
	/*Some births don't have mother's state and county of residence recorded
	we will replace this with the recorded state and county of occurrence, in 1990 
	the state was the same in 97% of all births and the pair was correct in 76 % 
	of all births */
	replace ctyfips = (stbirth*1000)+ctybirth if momst == 0 & momcty == 0
	replace ctyfips = 12086 if ctyfips == 12025 //Update the fips code to the current for Miami Dade
	replace ctyfips = 51083 if ctyfips == 51780 // South Boston in VA merged with Halifax county
	replace ctyfips = 51005 if ctyfips == 51560 // Clifton Forge in VA merged with Alleghany county
	replace ctyfips = 30067 if ctyfips == 30113 // Yellowstone added to Park County Montana
	*Get County Level Measures
	//Annual
	cd $vsdata
	//Expects cty_allyrq90_13.dta to exist in the Vital Statistics folder
	append using cty_allyrq90_13 // tacks on one observation for each county so counties with no births are still present
	//Keep conceptions dated to the birth year or the year before it
	keep if yrconcept == `yr' | yrconcept == `yr'-1
	keep totb* marb* nmarb* totfb* marfb* nmarfb* totlscoll* marlscoll* nmarlscoll* totcoll* marcoll* nmarcoll* ctyfips yrconcept qconcept
	compress
	//county, quarterly
	collapse (sum) totb* marb* nmarb* totfb* marfb* nmarfb* ///
		totlscoll* marlscoll* nmarlscoll* totcoll* marcoll* nmarcoll*, by(ctyfips yrconcept qconcept)
	drop if yrconcept == .
	
	cd $vsdata 
	save vscty_birthsq`yr', replace
	//annual (sums the quarterly counts just saved)
	collapse (sum) totb* marb* nmarb* totfb* marfb* nmarfb* ///
		totlscoll* marlscoll* nmarlscoll* totcoll* marcoll* nmarcoll*, by(ctyfips yrconcept)
	drop if yrconcept == .
	
	cd $vsdata
	save vsctybinsannual`yr', replace
}

*Stack the yearly county files into one panel per frequency. A conception year
*shows up in two birth-year files, so the counts are summed once more after
*appending.
cd $vsdata
local birthcounts totb* marb* nmarb* totfb* marfb* nmarfb* totlscoll* marlscoll* nmarlscoll* totcoll* marcoll* nmarcoll*

//Annual panel: county by conception year
use vsctybinsannual1990, clear
forvalues y = 1991/2013 {
	append using vsctybinsannual`y'
}
collapse (sum) `birthcounts', by( yrconcept ctyfips)
save vs_ctyconceptyr_annual, replace

//Quarterly panel: county by conception year and quarter
use vscty_birthsq1990, clear
forvalues y = 1991/2013 {
	append using vscty_birthsq`y'
}
collapse (sum) `birthcounts', by(yrconcept qconcept ctyfips)
save vs_ctyconceptyr_quarterly, replace

////////////////////////////////////////////
/*******************************************
COMPILE ALL OF THE DATA INTO THE DATASET I 
WILL USE FOR MY FINAL ESTIMATION

STEP 1: USE THE QUARTERLY DATA TO FLAG COUNTIES WITH OVER 10 BIRTHS IN EVERY QUARTER
STEP 2 (FURTHER BELOW): BUILD THE ANNUAL DATASET
*******************************************/
cd $vsdata
use vs_ctyconceptyr_quarterly, clear
//Conception years 1999 through 2012 only
keep if inrange(yrconcept,1999,2012)
//State part of the county code; drop state codes 2, 15, and 60 or higher
gen stcode = floor(ctyfips/1000)
drop if inlist(stcode,2,15) | stcode>=60
drop stcode
//A county is in the sample when its smallest quarterly birth count exceeds 10
bys ctyfips: egen minbirths = min(totb)
gen over10samp = minbirths>10
//One row per county
collapse (max) over10samp, by(ctyfips)
cd $data
save over10sampcty, replace

/*************************
COLLAPSE FROM COUNTY TO PUMA
*************************/
cd $data
use cty_to_puma2000, clear

//Weight for each county-PUMA pair: the pair's pop2k over the county total
bys ctyfips: egen totpop00 = total(pop2k)
gen double popwt = pop2k/totpop00
//Number the PUMAs within each county, largest weight first, and go wide so
//that there is one row per county
gsort ctyfips -popwt
bys ctyfips: gen n = _n
reshape wide puma00 pop2k popwt, i(ctyfips) j(n) 

//Attach the annual county birth counts; rows found only in the birth file
//(Broomfield County, Alaska, Hawaii, and territories) are not kept
cd $vsdata
merge 1:m ctyfips using vs_ctyconceptyr_annual, keep(master match) nogenerate

//Drop conception years before 1997, conception year 2013, and county 51515
//(this county is excluded because it has too few births)
drop if yrconcept<1997 | yrconcept == 2013 | ctyfips == 51515
rename yrconcept year

foreach var in tot mar nmar {
	//Create Non-First Birth measures: all births minus first births
	foreach a in 15_17 18_34 35_44 {
		gen `var'nfb`a' = `var'b`a'-`var'fb`a'
	}
	//Rename the Race measures to align with other datasets (nh goes in front)
	foreach a in "" "15_17" "18_34" "35_44" {
		foreach r in white black other {
			rename `var'b`r'nh`a' `var'bnh`r'`a'
		}
	}
}

//Back to one row per county, year and PUMA slot; unused slots are dropped
reshape long puma00 pop2k popwt, i(ctyfips year) j(pumanum)
drop if puma00 == .

cd $data
save vitalstats_ctypumapair_raw, replace

/************************************************
//NOW I NEED TO COLLAPSE TO THE PUMA LEVEL AND CREATE THE VARIABLES OF INTEREST
************************************************/

cd $data
use vitalstats_ctypumapair_raw, clear

//Population-weighted sum of the county counts within each PUMA and year
collapse (sum) tot* mar* nmar* ///
	[pw = popwt] , by(stfips puma00 year)
	
cd $data
merge 1:1 puma00 stfips year using analysissamp_novitalstats, gen(vs_merge)
drop if vs_merge == 1 
/* These observations are Montana PUMA, North Dakota PUMA, and the PUMA in Texas 
that corresponds to Webb County Texas. These PUMA are excluded from the analysis 
as outlined in the text*/
//BIRTH RATES
//Rates are births per 1,000 women in the matching population group. The ihs
//variables are the inverse hyperbolic sine, ln(x+sqrt(x^2+1)), which is
//defined at zero; the ln variables are missing when the rate is zero.
foreach var in tot mar nmar {
	gen ihs`var'b = ln(`var'b+sqrt(`var'b^2+1))
	foreach a in 15_17 18_34 35_44 {
		gen ihs`var'b`a' = ln(`var'b`a'+sqrt(`var'b`a'^2+1))
		gen `var'br`a' = (`var'b`a'/fpop`a')*1000
		gen ihs`var'br`a' = ln(`var'br`a'+sqrt(`var'br`a'^2+1))
		gen ln`var'br`a' = ln(`var'br`a')
		//by birth parity 
		gen `var'fbr`a' = (`var'fb`a'/fpop`a')*1000
		gen ln`var'fbr`a' = ln(`var'fbr`a')
		gen `var'nfbr`a' = (`var'nfb`a'/fpop`a')*1000
		gen ln`var'nfbr`a' = ln(`var'nfbr`a')
		//by race
		foreach r in nhwhite nhblack nhother hisp {
			gen `var'br`r'`a' = (`var'b`r'`a'/f`r'pop`a')*1000
			gen ln`var'br`r'`a' = ln(`var'br`r'`a')
		}
		
	}
	//WE DO NOT HAVE POP BY EDUC FOR 15-17, BUT THEY SHOULD ALL BE <COLL
	foreach a in 18_34 35_44 {
		foreach e in lscoll coll {
			gen ihs`var'`e'b`a' = ln(`var'`e'b`a'+sqrt(`var'`e'b`a'^2+1))
			//Divide by the population of the same education group. The rate
			//used to divide by the less-than-college population for both
			//groups, which mis-scaled the college birth rate. If the
			//education-specific population is not in the data, warn and fall
			//back to the old denominator so the file still runs.
			local denom f`e'pop`a'
			capture confirm variable `denom', exact
			if _rc {
				display as error "WARNING: `denom' not found; `var'`e'br`a' uses flscollpop`a' as the denominator"
				local denom flscollpop`a'
			}
			gen `var'`e'br`a' = (`var'`e'b`a'/`denom')*1000
			gen ihs`var'`e'br`a' = ln(`var'`e'br`a'+sqrt(`var'`e'br`a'^2+1))
			gen ln`var'`e'br`a' = ln(`var'`e'br`a')
		}
	}
}
//NON-MARITAL SHARE OF BIRTHS (in percent)
foreach a in 15_17 18_34 35_44 {
	gen sharenmar`a' = (nmarb`a'/totb`a')*100
	gen ihssharenmar`a' = ln(sharenmar`a'+sqrt(sharenmar`a'^2+1))
	gen lnsharenmar`a' = ln(sharenmar`a')
	//by birth parity
	gen sharefbnmar`a' = (nmarfb`a'/totfb`a')*100
	gen sharenfbnmar`a' = (nmarnfb`a'/totnfb`a')*100
	//by race
	foreach r in nhwhite nhblack nhother hisp {
		gen sharenmar`r'`a' = (nmarb`r'`a'/totb`r'`a')*100
	}
}
//by education (18-34 and 35-44 only)
foreach a in 18_34 35_44 {
	foreach e in lscoll coll {
		gen share`e'nmar`a' = (nmar`e'b`a'/tot`e'b`a')*100
		gen ihsshare`e'nmar`a' = ln(share`e'nmar`a'+sqrt(share`e'nmar`a'^2+1))
		gen lnshare`e'nmar`a' = ln(share`e'nmar`a')
	}
}

//Pull out the number of births to 18-34 year olds in 2000 for weighting
//t is filled only in the year-2000 row; max() copies it to every row of the PUMA
gen t = totb18_34 if year == 2000
bys stpuma: egen totb2000 = max(t)
drop t

cd $data
save analysissamp_alldata, replace
